Task 1

In this task we are asked to select 5 bookmakers to see if the over/under 2.5 game result can be explained by the odds. I selected William Hill, 888sport, 188BET, Sportingbet and Tipico. The code below shows the data preparation.

require(data.table)
require(anytime)
require(plotly)
require(plyr)
require(MASS)
require(imputeTS)
require(jpeg)

# Fix the RNG seed so the random draws made further down are reproducible.
set.seed(12)

# Load the match results and the bookmaker odds history as data.tables.
# Exact duplicate rows of the matches table are dropped straight away.
matches <- unique(data.table(
  readRDS("df9b1196-e3cf-4cc7-9159-f236fe738215_matches.RDS")
))
odds <- data.table(
  readRDS("df9b1196-e3cf-4cc7-9159-f236fe738215_odd_details.RDS")
)

# Keep only the odds rows whose match is present in the matches table.
# NOTE(review): this step was headed "Selecting Soccer Games", but no sport
# filter is applied here -- confirm whether one is expected.
odds <- odds[matchId %in% matches$matchId]

# Tag over/under bets with their goal threshold (e.g. "over_2.5") so that
# every threshold is a distinct odd type.
# paste0() is used on purpose: paste() inserts its default " " separator and
# would create labels with an embedded space such as "over_ 2.5".
odds[oddtype == "over", oddtype := paste0("over_", totalhandicap)]
odds[oddtype == "under", oddtype := paste0("under_", totalhandicap)]

# Parse the kick-off timestamp, sort by home team (latest match first within
# a team) and drop the raw `date` column.
matches[, match_time := anytime(date)]
matches <- matches[order(home, -match_time)]
matches[, date := NULL]

# Split the "home:away" score string into numeric goal counts.
matches[, c("HomeGoals", "AwayGoals") :=
  lapply(tstrsplit(score, ":"), as.numeric)]
matches[, TotalGoals := HomeGoals + AwayGoals]

# IsOver is 1 when more than 2 goals were scored and 0 otherwise. Rows with
# an unknown total get 0 here and are removed by the complete-case filter.
matches[, IsOver := as.numeric(!is.na(TotalGoals) & TotalGoals > 2)]
matches <- matches[complete.cases(matches)]

# Match outcome: 0 is a draw, 1 is a home win and 2 is an away win.
matches[, homewin := 1 * (HomeGoals > AwayGoals) + 2 * (HomeGoals < AwayGoals)]

# Calendar features taken from the kick-off time.
matches[, `:=`(
  Year = year(match_time),
  Month = month(match_time),
  Weekday = wday(match_time),
  Hour = hour(match_time)
)]


# Final odds: sort by date inside every (match, odd type, bookmaker) group
# and keep the last quoted odd of each group.
odds_a <- odds[
  order(matchId, oddtype, bookmaker, date),
  .(odd = odd[.N]),
  by = .(matchId, oddtype, bookmaker)
]


# Reshape one bookmaker's long odds table to one row per match and one column
# per odd type, then tag every odd column with the bookmaker suffix.
#
# long_odds: data.table with columns matchId, oddtype and odd.
# suffix:    tag appended to each odd column name (e.g. "_W").
# Returns the wide data.table; matchId keeps its name so it can be merged on.
widen_bookmaker_odds <- function(long_odds, suffix) {
  wide <- dcast(long_odds, matchId ~ oddtype, value.var = "odd")
  # Rename every column except the key. The columns are looked up by name
  # instead of by a hard-coded index range (2:23, 2:39, ...), so the code
  # keeps working when a bookmaker quotes a different number of odd types.
  odd_cols <- setdiff(names(wide), "matchId")
  if (length(odd_cols) > 0L) {
    # paste() (with its " " separator) is kept so the resulting column names
    # are exactly the ones produced before, e.g. "odd1 _W".
    setnames(wide, odd_cols, paste(odd_cols, suffix))
  }
  wide
}

# Final odds of the five selected bookmakers, in long form.
odds_a_William <- odds_a[bookmaker == "William Hill"]
odds_a_888sport <- odds_a[bookmaker == "888sport"]
odds_a_188BET <- odds_a[bookmaker == "188BET"]
odds_a_Sportingbet <- odds_a[bookmaker == "Sportingbet"]
odds_a_Tipico <- odds_a[bookmaker == "Tipico"]

# The same odds in wide form, one suffix per bookmaker.
odds_a_wide_William <- widen_bookmaker_odds(odds_a_William, "_W")
odds_a_wide_888sport <- widen_bookmaker_odds(odds_a_888sport, "_8")
odds_a_wide_188BET <- widen_bookmaker_odds(odds_a_188BET, "_B")
odds_a_wide_Sportingbet <- widen_bookmaker_odds(odds_a_Sportingbet, "_S")
odds_a_wide_Tipico <- widen_bookmaker_odds(odds_a_Tipico, "_T")

# Inner-join the match table with every bookmaker's odds on matchId, so only
# matches quoted by all five bookmakers remain. The join order (888sport,
# William Hill, 188BET, Sportingbet, Tipico) fixes the column order.
merged_matches <- Reduce(
  function(left, right) merge(left, right, by = "matchId"),
  list(
    odds_a_wide_888sport,
    odds_a_wide_William,
    odds_a_wide_188BET,
    odds_a_wide_Sportingbet,
    odds_a_wide_Tipico
  ),
  matches
)


# Keep the two class labels aside; they are used later to colour the plots.
Over_Under <- merged_matches$IsOver
Home_win <- merged_matches$homewin

# Columns that describe the match itself rather than bookmaker odds.
non_feature_cols <- c(
  "leagueId", "home", "homewin", "away", "type", "match_time", "Year",
  "Month", "Weekday", "Hour", "score", "HomeGoals", "AwayGoals",
  "TotalGoals", "matchId", "IsOver"
)

# Build the odds-only table as a new object. Selecting the remaining columns
# (instead of deleting with `:= NULL`) leaves merged_matches untouched;
# `:=` works by reference and would strip those columns from it as well.
merged_matches_1 <- merged_matches[
  ,
  setdiff(names(merged_matches), non_feature_cols),
  with = FALSE
]

# Replace the missing values of a numeric vector with the mean of its
# observed values. Base R is used so the step does not depend on
# imputeTS::na.mean, which newer imputeTS releases renamed to na_mean.
impute_column_mean <- function(x) {
  is_missing <- is.na(x)
  if (any(is_missing)) {
    x[is_missing] <- mean(x, na.rm = TRUE)
  }
  x
}

# Replacing NA's with column means
merged_matches_1 <- merged_matches_1[, lapply(.SD, impute_column_mean)]

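Before applying PCA, I min-max scaled the data using the overall minimum and maximum of the whole table (one global range rather than a separate range per column). Then, I applied PCA.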

# Min-max scale the feature table into [0, 1]. One minimum and one maximum
# taken over the whole table are used, so every column is shifted and
# stretched by the same amount.
merged_matches_1 <- local({
  lowest <- min(merged_matches_1)
  highest <- max(merged_matches_1)
  (merged_matches_1 - lowest) / (highest - lowest)
})

# Principal component analysis of the scaled odds.
pca_m <- princomp(merged_matches_1)

# Standard deviation and share of variance of every component.
print(summary(pca_m))
## Importance of components:
##                           Comp.1    Comp.2     Comp.3     Comp.4
## Standard deviation     0.3007892 0.1581031 0.10570677 0.08834488
## Proportion of Variance 0.5761426 0.1591791 0.07115594 0.04970137
## Cumulative Proportion  0.5761426 0.7353217 0.80647769 0.85617905
##                            Comp.5     Comp.6     Comp.7     Comp.8
## Standard deviation     0.07894591 0.04924815 0.04525255 0.04071873
## Proportion of Variance 0.03968851 0.01544491 0.01304042 0.01055830
## Cumulative Proportion  0.89586756 0.91131248 0.92435290 0.93491120
##                             Comp.9     Comp.10     Comp.11     Comp.12
## Standard deviation     0.039320644 0.035823324 0.032489294 0.029178092
## Proportion of Variance 0.009845705 0.008172169 0.006721808 0.005421499
## Cumulative Proportion  0.944756906 0.952929075 0.959650884 0.965072382
##                            Comp.13     Comp.14     Comp.15     Comp.16
## Standard deviation     0.026013609 0.023075414 0.020732736 0.017328194
## Proportion of Variance 0.004309301 0.003390819 0.002737277 0.001912107
## Cumulative Proportion  0.969381683 0.972772502 0.975509779 0.977421886
##                            Comp.17     Comp.18     Comp.19     Comp.20
## Standard deviation     0.016841744 0.016463476 0.016041904 0.015212012
## Proportion of Variance 0.001806258 0.001726031 0.001638768 0.001473598
## Cumulative Proportion  0.979228144 0.980954175 0.982592943 0.984066541
##                            Comp.21     Comp.22     Comp.23     Comp.24
## Standard deviation     0.014230220 0.014032020 0.013624282 0.013366662
## Proportion of Variance 0.001289522 0.001253851 0.001182042 0.001137762
## Cumulative Proportion  0.985356063 0.986609914 0.987791956 0.988929718
##                            Comp.25      Comp.26      Comp.27      Comp.28
## Standard deviation     0.012606192 0.0122817307 0.0116032365 0.0111986748
## Proportion of Variance 0.001011983 0.0009605605 0.0008573613 0.0007986175
## Cumulative Proportion  0.989941701 0.9909022618 0.9917596231 0.9925582406
##                             Comp.29      Comp.30      Comp.31      Comp.32
## Standard deviation     0.0104162406 0.0101279968 0.0093183474 0.0087181837
## Proportion of Variance 0.0006909197 0.0006532098 0.0005529468 0.0004840137
## Cumulative Proportion  0.9932491603 0.9939023701 0.9944553170 0.9949393306
##                             Comp.33      Comp.34      Comp.35      Comp.36
## Standard deviation     0.0078070231 0.0076256506 0.0072447283 0.0069462065
## Proportion of Variance 0.0003881294 0.0003703048 0.0003342334 0.0003072564
## Cumulative Proportion  0.9953274600 0.9956977648 0.9960319982 0.9963392546
##                            Comp.37      Comp.38     Comp.39      Comp.40
## Standard deviation     0.006801062 0.0064614961 0.006266781 0.0060718329
## Proportion of Variance 0.000294550 0.0002658715 0.000250089 0.0002347714
## Cumulative Proportion  0.996633805 0.9968996761 0.997149765 0.9973845366
##                             Comp.41     Comp.42      Comp.43      Comp.44
## Standard deviation     0.0058622889 0.005479961 0.0050945806 0.0049374300
## Proportion of Variance 0.0002188467 0.000191232 0.0001652808 0.0001552414
## Cumulative Proportion  0.9976033833 0.997794615 0.9979598961 0.9981151375
##                             Comp.45      Comp.46      Comp.47      Comp.48
## Standard deviation     0.0046172576 0.0045218804 0.0044364377 0.0042413804
## Proportion of Variance 0.0001357606 0.0001302098 0.0001253356 0.0001145566
## Cumulative Proportion  0.9982508981 0.9983811079 0.9985064435 0.9986210001
##                            Comp.49      Comp.50      Comp.51      Comp.52
## Standard deviation     0.004103624 3.919743e-03 3.891632e-03 3.703321e-03
## Proportion of Variance 0.000107236 9.784097e-05 9.644266e-05 8.733498e-05
## Cumulative Proportion  0.998728236 9.988261e-01 9.989225e-01 9.990099e-01
##                             Comp.53      Comp.54      Comp.55      Comp.56
## Standard deviation     3.682669e-03 3.514170e-03 0.0033995795 3.256081e-03
## Proportion of Variance 8.636362e-05 7.864136e-05 0.0000735963 6.751435e-05
## Cumulative Proportion  9.990962e-01 9.991749e-01 0.9992484560 9.993160e-01
##                             Comp.57      Comp.58      Comp.59      Comp.60
## Standard deviation     0.0031502480 3.059198e-03 2.969977e-03 2.908556e-03
## Proportion of Variance 0.0000631968 5.959649e-05 5.617094e-05 5.387168e-05
## Cumulative Proportion  0.9993791671 9.994388e-01 9.994949e-01 9.995488e-01
##                             Comp.61      Comp.62      Comp.63      Comp.64
## Standard deviation     2.790180e-03 2.673921e-03 2.533314e-03 0.0024155973
## Proportion of Variance 4.957584e-05 4.553054e-05 4.086801e-05 0.0000371582
## Cumulative Proportion  9.995984e-01 9.996439e-01 9.996848e-01 0.9997219388
##                             Comp.65      Comp.66      Comp.67      Comp.68
## Standard deviation     2.302658e-03 0.0019398604 1.820301e-03 1.742125e-03
## Proportion of Variance 3.376483e-05 0.0000239633 2.110046e-05 1.932698e-05
## Cumulative Proportion  9.997557e-01 0.9997796669 9.998008e-01 9.998201e-01
##                             Comp.69      Comp.70      Comp.71      Comp.72
## Standard deviation     1.634792e-03 1.609001e-03 1.521542e-03 1.496466e-03
## Proportion of Variance 1.701887e-05 1.648611e-05 1.474259e-05 1.426066e-05
## Cumulative Proportion  9.998371e-01 9.998536e-01 9.998683e-01 9.998826e-01
##                             Comp.73      Comp.74      Comp.75      Comp.76
## Standard deviation     1.490756e-03 1.406883e-03 1.298971e-03 1.213208e-03
## Proportion of Variance 1.415203e-05 1.260438e-05 1.074496e-05 9.372938e-06
## Cumulative Proportion  9.998968e-01 9.999094e-01 9.999201e-01 9.999295e-01
##                             Comp.77      Comp.78      Comp.79      Comp.80
## Standard deviation     1.174271e-03 9.812767e-04 9.261688e-04 8.841621e-04
## Proportion of Variance 8.780961e-06 6.131808e-06 5.462430e-06 4.978166e-06
## Cumulative Proportion  9.999383e-01 9.999444e-01 9.999499e-01 9.999548e-01
##                             Comp.81      Comp.82      Comp.83      Comp.84
## Standard deviation     8.242017e-04 7.837303e-04 7.404076e-04 6.966572e-04
## Proportion of Variance 4.325861e-06 3.911460e-06 3.490980e-06 3.090607e-06
## Cumulative Proportion  9.999592e-01 9.999631e-01 9.999666e-01 9.999696e-01
##                             Comp.85      Comp.86      Comp.87      Comp.88
## Standard deviation     6.877674e-04 6.422308e-04 6.157956e-04 5.893212e-04
## Proportion of Variance 3.012234e-06 2.626563e-06 2.414787e-06 2.211616e-06
## Cumulative Proportion  9.999727e-01 9.999753e-01 9.999777e-01 9.999799e-01
##                             Comp.89      Comp.90      Comp.91      Comp.92
## Standard deviation     5.676149e-04 5.405344e-04 4.647236e-04 4.511995e-04
## Proportion of Variance 2.051697e-06 1.860597e-06 1.375292e-06 1.296411e-06
## Cumulative Proportion  9.999820e-01 9.999838e-01 9.999852e-01 9.999865e-01
##                             Comp.93      Comp.94      Comp.95      Comp.96
## Standard deviation     4.441513e-04 4.336323e-04 4.218790e-04 4.122193e-04
## Proportion of Variance 1.256225e-06 1.197426e-06 1.133395e-06 1.082087e-06
## Cumulative Proportion  9.999878e-01 9.999890e-01 9.999901e-01 9.999912e-01
##                             Comp.97      Comp.98      Comp.99     Comp.100
## Standard deviation     3.829616e-04 3.501240e-04 3.345040e-04 3.168224e-04
## Proportion of Variance 9.339336e-07 7.806373e-07 7.125382e-07 6.392008e-07
## Cumulative Proportion  9.999921e-01 9.999929e-01 9.999936e-01 9.999942e-01
##                            Comp.101     Comp.102     Comp.103     Comp.104
## Standard deviation     3.038974e-04 2.847233e-04 2.784526e-04 2.693517e-04
## Proportion of Variance 5.881112e-07 5.162399e-07 4.937511e-07 4.620034e-07
## Cumulative Proportion  9.999948e-01 9.999953e-01 9.999958e-01 9.999963e-01
##                            Comp.105     Comp.106     Comp.107     Comp.108
## Standard deviation     2.629394e-04 2.505095e-04 2.356833e-04 2.235829e-04
## Proportion of Variance 4.402677e-07 3.996262e-07 3.537229e-07 3.183338e-07
## Cumulative Proportion  9.999967e-01 9.999971e-01 9.999975e-01 9.999978e-01
##                            Comp.109     Comp.110     Comp.111     Comp.112
## Standard deviation     2.091962e-04 1.959772e-04 1.876953e-04 1.796528e-04
## Proportion of Variance 2.786847e-07 2.445776e-07 2.243430e-07 2.055292e-07
## Cumulative Proportion  9.999981e-01 9.999983e-01 9.999986e-01 9.999988e-01
##                            Comp.113     Comp.114     Comp.115     Comp.116
## Standard deviation     1.704274e-04 1.627868e-04 1.535437e-04 1.405270e-04
## Proportion of Variance 1.849628e-07 1.687501e-07 1.501307e-07 1.257549e-07
## Cumulative Proportion  9.999989e-01 9.999991e-01 9.999993e-01 9.999994e-01
##                            Comp.117     Comp.118     Comp.119     Comp.120
## Standard deviation     1.324027e-04 1.163779e-04 1.044384e-04 1.036092e-04
## Proportion of Variance 1.116348e-07 8.624757e-08 6.945860e-08 6.836002e-08
## Cumulative Proportion  9.999995e-01 9.999996e-01 9.999997e-01 9.999997e-01
##                            Comp.121     Comp.122     Comp.123     Comp.124
## Standard deviation     8.749634e-05 7.782679e-05 7.687877e-05 7.468309e-05
## Proportion of Variance 4.875120e-08 3.857126e-08 3.763730e-08 3.551813e-08
## Cumulative Proportion  9.999998e-01 9.999998e-01 9.999998e-01 9.999999e-01
##                            Comp.125     Comp.126     Comp.127     Comp.128
## Standard deviation     7.096385e-05 6.456865e-05 6.076409e-05 4.474025e-05
## Proportion of Variance 3.206859e-08 2.654905e-08 2.351254e-08 1.274683e-08
## Cumulative Proportion  9.999999e-01 9.999999e-01 1.000000e+00 1.000000e+00
##                            Comp.129     Comp.130     Comp.131     Comp.132
## Standard deviation     3.279707e-05 2.660972e-05 2.523517e-05 1.918304e-05
## Proportion of Variance 6.849763e-09 4.509060e-09 4.055254e-09 2.343368e-09
## Cumulative Proportion  1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
##                            Comp.133     Comp.134     Comp.135     Comp.136
## Standard deviation     1.139498e-05 1.087581e-05 8.571961e-06 7.778894e-06
## Proportion of Variance 8.268611e-10 7.532328e-10 4.679139e-10 3.853375e-10
## Cumulative Proportion  1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
##                            Comp.137     Comp.138     Comp.139     Comp.140
## Standard deviation     4.987844e-06 3.732313e-10 3.375250e-10 4.513394e-11
## Proportion of Variance 1.584278e-10 8.870776e-19 7.254667e-19 1.297216e-20
## Cumulative Proportion  1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
##                            Comp.141     Comp.142
## Standard deviation     2.745726e-11 5.352892e-12
## Proportion of Variance 4.800869e-21 1.824662e-22
## Cumulative Proportion  1.000000e+00 1.000000e+00

From the summary, we can see that the first two components explain around 73% of the variance. So, with the code below I calculate the new coordinates based on these two components and plot them with different colors based on the over/under result of each match.

# Scores of every match on the first two principal components.
p <- pca_m$scores[, c("Comp.1", "Comp.2")]

# Put the over/under label next to the two score columns.
p1 <- data.table(cbind(Over_Under, p))

# PC1 against PC2; the label (0 or 1) is shifted by one to pick colour 1 or 2.
plot(p1$Comp.1, p1$Comp.2, col = p1$Over_Under + 1)

Based on the plot, I can see no clear separation between the over and under results.

With the code below, I apply MDS to the data, first with Euclidean distance and then with Manhattan distance. The data is already scaled, so I don’t have to worry about that.

# Classical MDS in two dimensions on the pairwise Euclidean distances.
euc_dist <- dist(merged_matches_1, method = "euclidean")
mds1 <- cmdscale(euc_dist, k = 2)

# The two MDS coordinates carry no column names, so after the label column
# they are addressed as V2 and V3.
p2 <- data.table(cbind(Over_Under, mds1))

plot(p2$V2, p2$V3, col = p2$Over_Under + 1)

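With the Euclidean distance, I cannot see a clear separation. Interestingly, the coordinates for MDS came out as the same coordinates from PCA multiplied by -1. This is expected: classical MDS on Euclidean distances recovers the principal component scores, and the sign of each axis is arbitrary.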

# Classical MDS in two dimensions on the pairwise Manhattan distances.
man_dist <- dist(merged_matches_1, method = "manhattan")
mds2 <- cmdscale(man_dist, k = 2)

# The two MDS coordinates carry no column names, so after the label column
# they are addressed as V2 and V3.
p3 <- data.table(cbind(Over_Under, mds2))

plot(p3$V2, p3$V3, col = p3$Over_Under + 1)

Manhattan distance gives us a similar shape to Euclidean distance. This time the coordinates are different from PCA. However, there is still no clear separation between the over and under game results.

Task 2

This time, I check whether PCA can help with the separation of draws, home wins and away wins. With the code below I generate a color-coded graph.

# Pair the first two PCA scores with the match outcome
# (0 = draw, 1 = home win, 2 = away win).
p4 <- data.table(cbind(Home_win, p))

# The outcome is shifted by one so the three classes use colours 1, 2 and 3.
plot(p4$Comp.1, p4$Comp.2, col = p4$Home_win + 1)

Based on the plot, we can see a better separation between the home wins and away wins, with draws being in between them.

Task 3

With the “jpeg” package, I read a picture into R. To keep the computation fast, my picture was 400x400. First, I used rasterImage to print the picture in R. Then, with the image function, I plotted the three channels of the image.

# Read the JPEG; the result is indexed below as pic[row, column, channel].
pic <- readJPEG("hw2_pic.jpeg")

# Draw the colour picture over the whole unit plotting square.
plot.new()
rasterImage(as.raster(pic), 0, 0, 1, 1)

# Show the first three channels side by side, one panel each.
par(mfrow = c(1, 3))
invisible(lapply(1:3, function(channel) image(pic[, , channel])))

With the code below, I added noise to the picture. Then, I scaled the picture back into the [0, 1] range. You can see the noisy image below.

# Picture size is read from the array instead of being fixed at 400 x 400,
# so the step also works for pictures of another size or shape.
img_rows <- dim(pic)[1]
img_cols <- dim(pic)[2]
n <- img_rows * img_cols

# For every colour channel: add uniform noise drawn from [0, 0.1] to each
# pixel, then min-max rescale the channel back into [0, 1] so it stays a
# valid intensity. Channels are processed in order 1, 2, 3, which keeps the
# sequence of random draws the same as adding the noise channel by channel.
for (channel in 1:3) {
  noise <- matrix(
    runif(n, min = 0, max = 0.1),
    nrow = img_rows,
    ncol = img_cols
  )
  noisy <- pic[, , channel] + noise
  pic[, , channel] <- (noisy - min(noisy)) / (max(noisy) - min(noisy))
}

# Noisy colour picture on a single full-size panel. The layout is set before
# the new plot is opened so that the frame really uses the 1 x 1 layout.
par(mfrow = c(1, 1))
plot.new()
rasterImage(as.raster(pic), 0, 0, 1, 1)

# Noisy channels side by side. image() opens its own frame, so no extra
# plot.new() is needed (it would only consume one of the three panels).
par(mfrow = c(1, 3))
for (channel in 1:3) {
  image(pic[, , channel])
}

Greyscale conversion is done by adding all three channels together and then dividing this summation matrix by its maximum value.

# Greyscale picture: add the three channels pixel by pixel (channel 1 + 2,
# then + 3) and divide by the largest sum so the values end at 1.
graypic <- Reduce(`+`, lapply(1:3, function(channel) pic[, , channel]))
graypic <- graypic / max(graypic)

# Draw the greyscale picture over the whole unit plotting square.
plot.new()
rasterImage(as.raster(graypic), 0, 0, 1, 1)

I ran two for loops inside one another to get the 3x3 patches. Since the grayscale picture is already scaled, I didn’t worry about scaling it.

# Collect every (2 * radius + 1)^2 patch of a greyscale image.
#
# img:    numeric matrix of pixel intensities.
# radius: patch half-width; the default 1 gives 3x3 patches.
# Returns a matrix with one row per patch centre and one column per pixel of
# the patch. Centres are visited row by row (column index running fastest)
# and the pixels of a patch are stored column by column, i.e. the same
# layout as flattening img[(i-1):(i+1), (j-1):(j+1)] for i, then j.
extract_patches <- function(img, radius = 1L) {
  stopifnot(
    is.matrix(img),
    nrow(img) > 2 * radius,
    ncol(img) > 2 * radius
  )
  centre_rows <- (1 + radius):(nrow(img) - radius)
  centre_cols <- (1 + radius):(ncol(img) - radius)

  # One row per pixel position inside the patch; expand.grid varies its
  # first column fastest, which matches column-major flattening of a patch.
  offsets <- expand.grid(
    row_shift = -radius:radius,
    col_shift = -radius:radius
  )

  # The result is allocated once. The previous approach appended to a
  # vector `e` that was never created and re-copied it for every patch.
  patches <- matrix(
    NA_real_,
    nrow = length(centre_rows) * length(centre_cols),
    ncol = nrow(offsets)
  )
  for (k in seq_len(nrow(offsets))) {
    shifted <- img[
      centre_rows + offsets$row_shift[k],
      centre_cols + offsets$col_shift[k],
      drop = FALSE
    ]
    # t() makes the column index run fastest when the matrix is flattened.
    patches[, k] <- as.vector(t(shifted))
  }
  patches
}

# One row per interior pixel of the greyscale picture, nine columns (3x3).
e <- extract_patches(graypic)

The PCA and image plotting are done below. I scaled all the image matrices so that the pixel values are between 0 and 1.

# PCA of the patch matrix `e` (its rows are the observations).
pca_g <- princomp(e)

# Standard deviation and share of variance of every component.
print(summary(pca_g))
## Importance of components:
##                           Comp.1     Comp.2     Comp.3     Comp.4
## Standard deviation     0.6517813 0.13004052 0.12455905 0.07238277
## Proportion of Variance 0.8919765 0.03550642 0.03257618 0.01100069
## Cumulative Proportion  0.8919765 0.92748290 0.96005908 0.97105978
##                             Comp.5      Comp.6      Comp.7     Comp.8
## Standard deviation     0.068039264 0.062209051 0.048472971 0.04218271
## Proportion of Variance 0.009720056 0.008125624 0.004933429 0.00373610
## Cumulative Proportion  0.980779833 0.988905457 0.993838886 0.99757499
##                             Comp.9
## Standard deviation     0.033984614
## Proportion of Variance 0.002425014
## Cumulative Proportion  1.000000000
# Min-max rescale a numeric matrix into [0, 1] so it can be drawn as a raster.
rescale01 <- function(m) {
  (m - min(m)) / (max(m) - min(m))
}

# Number of 3x3 patch centres along one picture row: the picture width minus
# the one-pixel border on each side (398 for a 400-pixel-wide picture).
patches_per_row <- ncol(graypic) - 2L

# Turn the scores of one principal component back into an image. The scores
# are laid out with `patches_per_row` values per matrix column; the
# transpose applied at drawing time puts them back in picture orientation.
component_image <- function(component) {
  rescale01(matrix(pca_g$scores[, component], nrow = patches_per_row))
}

first_c <- component_image(1)
second_c <- component_image(2)
third_c <- component_image(3)

# Empty canvas without axes, then the three component images side by side.
{
  plot(NA, xlim = c(0, 11), ylim = c(0, 8), type = "n", xaxt = "n",
       yaxt = "n", xlab = "", ylab = "")
  rasterImage(as.raster(t(first_c)), 0, 2, 3, 6)
  rasterImage(as.raster(t(second_c)), 4, 2, 7, 6)
  rasterImage(as.raster(t(third_c)), 8, 2, 11, 6)
}

# Reshape the loadings of one component into a 3-row matrix (one value per
# patch pixel) and min-max rescale them into [0, 1] for drawing.
loading_patch <- function(component) {
  patch <- matrix(pca_g$loadings[, component], nrow = 3)
  (patch - min(patch)) / (max(patch) - min(patch))
}

ev1 <- loading_patch(1)
ev2 <- loading_patch(2)
ev3 <- loading_patch(3)

# Empty canvas without axes, then the three loading patches side by side,
# each 3 units wide and starting at x = 0, 4 and 8.
{
  plot(NA, xlim = c(0, 11), ylim = c(0, 8), type = "n", xaxt = "n",
       yaxt = "n", xlab = "", ylab = "")
  loading_patches <- list(ev1, ev2, ev3)
  left_edges <- c(0, 4, 8)
  for (k in seq_along(loading_patches)) {
    rasterImage(
      as.raster(t(loading_patches[[k]])),
      left_edges[k], 2, left_edges[k] + 3, 6
    )
  }
}

The first component of the PCA seems to be the negative of the picture. The eigenvectors seem to show the intensity in the left, middle and right side of the picture.